#ifndef F70CC480_E6E6_43C1_A7D6_3EEB74F05088
#define F70CC480_E6E6_43C1_A7D6_3EEB74F05088

// Copyright © 2024 Conor Williams <conorwilliams@outlook.com>
// Copyright © 2020 Andrey Semashev

// SPDX-License-Identifier: BSL-1.0

// Distributed under the Boost Software License, Version 1.0.

// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

#include <atomic>

#include "libfork/core/macro.hpp"

/**
 * @file atomics.hpp
 *
 * @brief A port of part of boost::atomic_thread_fence(seq_cst) to work around clang's bad codegen.
 */

namespace lf::impl {

/**
 * @brief Issue a sequentially-consistent thread fence.
 *
 * Stand-in for ``std::atomic_thread_fence(std::memory_order_seq_cst)`` that avoids the bad code clang
 * generates for that call. When compiling with clang for x86 (32 or 64 bit) a hand-written instruction
 * sequence is used; in every other configuration this forwards to the standard fence.
 *
 * Background on the clang issue:
 *
 * https://github.com/llvm/llvm-project/issues/91731
 * https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/
 * https://github.com/boostorg/atomic/issues/36
 *
 * The x86 path is a port of part of boost::atomic_thread_fence(seq_cst), see:
 *
 * https://github.com/boostorg/atomic/blob/5bbcce0f6e855dc4009e2e6977c62e0520c39573/include/boost/atomic/detail/fence_arch_ops_gcc_x86.hpp
 * https://github.com/boostorg/atomic/blob/5bbcce0f6e855dc4009e2e6977c62e0520c39573/include/boost/atomic/detail/platform.hpp
 *
 */
LF_FORCEINLINE inline void thread_fence_seq_cst() {

#if defined(__clang__) && defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
  /**
   * An mfence would give us a seq_cst fence, but a throwaway lock-prefixed instruction is sufficient and,
   * on most modern x86 CPUs (as of 2020), faster. The memory operand of that instruction should be a
   * location that:
   *
   * - Is private to this thread, a stack variable satisfies this.
   * - Is probably already in cache, anything near the top of the stack satisfies this.
   * - Does not alias live stack data, otherwise we would create a false data dependency.
   *
   * Performance data: https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
   *
   * The zero-initialization of the operand is not needed for the fence itself, it is only there to keep
   * tools like valgrind from reporting a read of uninitialized memory.
   */
  unsigned char scratch = 0u;
  // "+m" makes `scratch` a read-write memory operand; the "memory" clobber stops the compiler from moving
  // memory accesses across the statement.
  __asm__ __volatile__("lock; notb %0" : "+m"(scratch) : : "memory");
#else
  // Anything other than clang-on-x86: use the standard fence.
  std::atomic_thread_fence(std::memory_order_seq_cst);
#endif
}

} // namespace lf::impl

#endif /* F70CC480_E6E6_43C1_A7D6_3EEB74F05088 */
